Executive Summary


Full Report

Initial Data Analysis (IDA)

Creating Data Frames for each Suburb

#' Scrape sold-townhouse listings for one suburb from auhouseprices.com
#'
#' Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
#'
#' @param location URL path fragment of the form "<postcode>/<suburb>/".
#' @return A data frame with one row per listing that has a recorded sold
#'   price and the columns `address`, `bedroom`, `bathroom`, `carspace`,
#'   `soldprice` and `yearsold` (a Date); `NULL` when no page yields any rows.
house_scraping <- function(location = "2151/Parramatta/") {
  # URL of one results page (type set to townhouse, no other filtering).
  page_url <- function(page) {
    paste0("https://www.auhouseprices.com/sold/list/NSW/",
           location,
           page,
           "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw=")
  }

  # Position of the first element of `x` matching `pattern`. Fails with a
  # readable message instead of the cryptic error `1:integer(0)` would raise.
  first_match <- function(pattern, x) {
    hits <- grep(pattern, x)
    if (length(hits) == 0) {
      stop("Marker '", pattern, "' not found in the scraped page", call. = FALSE)
    }
    hits[1]
  }

  # Sold dates appear as "day month year", "month year" or just "year";
  # the number of whitespace-separated parts decides which parser to use.
  parse_sold_date <- function(x) {
    n_parts <- length(strsplit(x, "\\s")[[1]])
    if (n_parts == 3) {
      dmy(x)
    } else if (n_parts == 2) {
      my(x)
    } else {
      as.Date(paste0(x, "-01-01"))
    }
  }

  # Determine how many pages to scroll through: the first <h2> carries the
  # number of properties shown per page and the total number of properties.
  first_page <- read_html(page_url(1))
  heading <- (first_page %>% html_nodes("h2") %>% html_text())[1]
  numbers <- as.numeric(regmatches(heading, gregexpr("[0-9]+", heading))[[1]])
  # number of total properties / number on page = total number of pages
  end_page <- ceiling(numbers[3] / numbers[2])
  if (!is.finite(end_page)) {
    stop("Could not read the page count from the results heading for '",
         location, "'", call. = FALSE)
  }

  # Collect one data frame per page and bind once at the end, rather than
  # growing a data frame with rbind() inside the loop.
  pages <- vector("list", end_page)

  for (thispage in seq_len(end_page)) {
    if (thispage %% 5 == 0) {
      print(paste0("Processing page ", thispage))
    }

    # Page 1 was already downloaded for the page count; reuse it.
    webpage <- if (thispage == 1) first_page else read_html(page_url(thispage))

    result <- webpage %>% html_nodes("li") %>% html_text()
    # The relevant content ends at the first "current" entry.
    result <- result[seq_len(first_match("current", result))]
    # Remove the redundant "listed price" entries.
    result <- result[!grepl("List", result)]
    # Remove prices listed with rent.
    result <- result[!grepl("Rent", result)]

    # Entries with a "$" hold "<bed> <bath> <car> $<price>".
    price_info <- grep("\\$", result)
    if (length(price_info) == 0) {
      next # nothing with a recorded price on this page
    }

    price_bedroom <- strsplit(result[price_info], "\\$")
    rooms <- strsplit(trimws(vapply(price_bedroom, `[`, character(1), 1)), "\\s+")
    price <- trimws(vapply(price_bedroom, `[`, character(1), 2))
    price <- as.numeric(gsub(",", "", price))

    # The sold date is the entry immediately before each priced entry; the
    # price is sometimes not listed, so only priced entries are used.
    timesold <- trimws(gsub("Sold on", "", result[price_info - 1]))
    timesold <- do.call("c", lapply(timesold, parse_sold_date))

    # Addresses are the <h4> headings before the "Auction History" heading.
    address <- webpage %>% html_nodes("h4") %>% html_text()
    address <- address[seq_len(first_match("Auction History", address) - 1)]

    # For every "Sold on" entry the immediately following entry should be the
    # price; if it is not, that property has no price record and is dropped.
    sold_info <- grep("Sold on", result)
    contain_price <- sold_info %in% (price_info - 1)
    address <- address[contain_price]

    pages[[thispage]] <- data.frame(
      address = address,
      bedroom = as.numeric(vapply(rooms, `[`, character(1), 1)),
      bathroom = as.numeric(vapply(rooms, `[`, character(1), 2)),
      carspace = as.numeric(vapply(rooms, `[`, character(1), 3)),
      soldprice = price,
      yearsold = timesold
    )
  }

  # Skipped pages leave NULL slots; rbind() of an empty list yields NULL.
  do.call(rbind, Filter(Negate(is.null), pages))
}

 
 
# Scrape the sold-townhouse listings of each of the five suburbs.
# Suburb names that contain a space need to be joined with a "+" sign.
# The `## [1] ...` lines are the progress messages printed by the original run.
df_parramatta <- house_scraping( location = "2150/parramatta/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
df_merrylands <- house_scraping( location = "2160/merrylands/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
## [1] "Processing page 40"
df_auburn <- house_scraping( location = "2144/auburn/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
## [1] "Processing page 40"
## [1] "Processing page 45"
## [1] "Processing page 50"
## [1] "Processing page 55"
## [1] "Processing page 60"
df_eastwood <- house_scraping( location = "2122/eastwood/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
df_granville <- house_scraping( location = "2142/granville/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"

Writing longitude and latitude into dataframe

# Geocode the `address` column of every suburb table with the ArcGIS method;
# the coordinates are written to new `latitude` and `longitude` columns.
# The `## ...` lines are the console output of the original run.
l_parramatta <- df_parramatta%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 354 addresses to the ArcGIS single address geocoder
## Query completed in: 194.1 seconds
l_merrylands <- df_merrylands%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 407 addresses to the ArcGIS single address geocoder
## Query completed in: 229.6 seconds
l_auburn <- df_auburn%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 577 addresses to the ArcGIS single address geocoder
## Query completed in: 321.1 seconds
l_eastwood <- df_eastwood%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 183 addresses to the ArcGIS single address geocoder
## Query completed in: 109.3 seconds
l_granville <- df_granville%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 280 addresses to the ArcGIS single address geocoder
## Query completed in: 150.1 seconds

Function to calculate distance to train station

#' Haversine distance in kilometres from one or more points to a fixed place
#'
#' Substitute `fixed_lat` and `fixed_lon` with the coordinates of the desired
#' location (train station etc.) to incorporate a distance into the data.
#'
#' @param lat,lon Latitude(s) and longitude(s) in decimal degrees: scalars or
#'   equal-length vectors such as two data-frame columns.
#' @param fixed_lat,fixed_lon Coordinates of the fixed location.
#' @return Numeric vector of distances in kilometres, one per input point.
data_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  # distHaversine() takes points in (longitude, latitude) order. cbind() keeps
  # vector inputs paired row by row, whereas c(lon, lat) is only valid for a
  # single point. The result is divided by 1000 to report kilometres.
  distHaversine(cbind(lon, lat), c(fixed_lon, fixed_lat)) / 1000
}

# used google maps for all longitudes and latitudes

# Return `geo_df` with an extra column holding the distance (km) from each
# row's latitude/longitude to the station at (station_lat, station_lon).
# Rows are visited by index instead of apply(), which would first coerce the
# data frame to a matrix. data.frame() sanitises the requested column name
# "distance_to_train_station(km)" to `distance_to_train_station.km.`.
add_station_distance <- function(geo_df, station_lat, station_lon) {
  km <- vapply(
    seq_len(nrow(geo_df)),
    function(i) {
      data_distance_between(
        geo_df$latitude[i],
        geo_df$longitude[i],
        station_lat,
        station_lon
      )
    },
    numeric(1)
  )
  data.frame(geo_df, "distance_to_train_station(km)" = km)
}

parramatta_lat <- -33.8175
parramatta_lon <- 151.0050
l_parramatta_distance <- add_station_distance(l_parramatta, parramatta_lat, parramatta_lon)

merrylands_lat <- -33.8363
merrylands_lon <- 150.9926
l_merrylands_distance <- add_station_distance(l_merrylands, merrylands_lat, merrylands_lon)

auburn_lat <- -33.8490
auburn_lon <- 151.0329
l_auburn_distance <- add_station_distance(l_auburn, auburn_lat, auburn_lon)

eastwood_lat <- -33.7899
eastwood_lon <- 151.0821
l_eastwood_distance <- add_station_distance(l_eastwood, eastwood_lat, eastwood_lon)


granville_lat <- -33.8326
granville_lon <- 151.0120
l_granville_distance <- add_station_distance(l_granville, granville_lat, granville_lon)

# add_station_distance() calls data_distance_between() once per row of the
# suburb's data frame (e.g. l_parramatta), passing that row's latitude first
# and its longitude second, followed by the fixed station coordinates.

Classing distance

# Band the distance to the station into 250 m classes from 0 to 4 km.
# The breaks are generated with seq() so that no boundary can be left out: the
# hand-typed vector skipped 2.750, which produced a single "(2.5,3]" class and
# left the "(2.5,2.75]" and "(2.75,3]" bands filtered later in the report empty.
distance_breaks <- seq(0, 4, by = 0.25)

l_parramatta_distance$distance_class <- cut(l_parramatta_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_merrylands_distance$distance_class <- cut(l_merrylands_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_auburn_distance$distance_class <- cut(l_auburn_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_eastwood_distance$distance_class <- cut(l_eastwood_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_granville_distance$distance_class <- cut(l_granville_distance$"distance_to_train_station.km.", breaks = distance_breaks)

Combining Data

# Stack the five per-suburb tables (all share the same columns) into one.
combined_df <- do.call(
  rbind,
  list(
    l_parramatta_distance,
    l_merrylands_distance,
    l_auburn_distance,
    l_eastwood_distance,
    l_granville_distance
  )
)

Filtering Data

# One subset of the combined data per bedroom count (1 to 5).
combined_df_1bed <- combined_df %>% filter(bedroom == 1)
combined_df_2bed <- combined_df %>% filter(bedroom == 2)
combined_df_3bed <- combined_df %>% filter(bedroom == 3)
combined_df_4bed <- combined_df %>% filter(bedroom == 4)
combined_df_5bed <- combined_df %>% filter(bedroom == 5)
# Box plots of sold price (in units of $100,000) per distance band, drawn for
# each bedroom count twice: once plain and once filled by number of carspaces.
# Each summary() prints the raw sold prices of the subset just plotted; the
# `## ...` lines are the console output of the original run.
# NOTE(review): par(mfrow) configures base graphics; it presumably has no
# effect on the ggplot2 figures below - confirm before relying on it.
par(mfrow=c(1,2))
# 1 bedroom, plain
ggplot(combined_df_1bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 1 Bedroom", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

# 1 bedroom, filled by carspace count
ggplot(combined_df_1bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5, aes(fill=factor(carspace))) +
  labs(title = "Sold Price vs Distance from Train Station for 1 Bedroom", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

# 2 bedrooms, plain
ggplot(combined_df_2bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 2 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_2bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  185000  345000  422000  454197  535750 1470000
# 2 bedrooms, filled by carspace count
ggplot(combined_df_2bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5, aes(fill=factor(carspace))) +
  labs(title = "Sold Price vs Distance from Train Station for 2 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_2bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  185000  345000  422000  454197  535750 1470000
# 3 bedrooms, plain
ggplot(combined_df_3bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_3bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  410000  536000  574283  685000 2020000
# 3 bedrooms, filled by carspace count
ggplot(combined_df_3bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5, aes(fill=factor(carspace))) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_3bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  410000  536000  574283  685000 2020000
# 4 bedrooms, plain
ggplot(combined_df_4bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_4bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  525000  672500  704376  840000 3000000
# 4 bedrooms, filled by carspace count
ggplot(combined_df_4bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5, aes(fill=factor(carspace))) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_4bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  525000  672500  704376  840000 3000000
# 5 bedrooms, plain
ggplot(combined_df_5bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 5 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_5bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  595000  769004 1010000 1175092 1514000 2090000
# 5 bedrooms, filled by carspace count
ggplot(combined_df_5bed, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5, aes(fill=factor(carspace))) +
  labs(title = "Sold Price vs Distance from Train Station for 5 Bedrooms", x="Distance from Train Station(km)", y="Selling Price (x$100000)", fill = "Number of Carspaces")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  theme(plot.title = element_text(hjust=0.25))

summary(combined_df_5bed$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  595000  769004 1010000 1175092 1514000 2090000

Filtering Data by Carspaces and Bedrooms

# Subsets of the combined data for each bedroom / carspace combination that is
# plotted below.

# 1 bedroom
combined_df_1bed_1car <- combined_df %>% filter(bedroom == 1, carspace == 1)

# 2 bedrooms
combined_df_2bed_1car <- combined_df %>% filter(bedroom == 2, carspace == 1)
combined_df_2bed_2car <- combined_df %>% filter(bedroom == 2, carspace == 2)

# 3 bedrooms
combined_df_3bed_1car <- combined_df %>% filter(bedroom == 3, carspace == 1)
combined_df_3bed_2car <- combined_df %>% filter(bedroom == 3, carspace == 2)
combined_df_3bed_3car <- combined_df %>% filter(bedroom == 3, carspace == 3)
combined_df_3bed_4car <- combined_df %>% filter(bedroom == 3, carspace == 4)

# 4 bedrooms
combined_df_4bed_1car <- combined_df %>% filter(bedroom == 4, carspace == 1)
combined_df_4bed_2car <- combined_df %>% filter(bedroom == 4, carspace == 2)
combined_df_4bed_3car <- combined_df %>% filter(bedroom == 4, carspace == 3)
combined_df_4bed_4car <- combined_df %>% filter(bedroom == 4, carspace == 4)

# 5 bedrooms
combined_df_5bed_1car <- combined_df %>% filter(bedroom == 5, carspace == 1)
combined_df_5bed_2car <- combined_df %>% filter(bedroom == 5, carspace == 2)
combined_df_5bed_3car <- combined_df %>% filter(bedroom == 5, carspace == 3)

1 bedroom

# Sold price (in units of $100,000) per distance band for townhouses with
# 1 bedroom and 1 carspace, followed by a summary of the raw sold prices.
ggplot(
  data = combined_df_1bed_1car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  labs(
    y = "Selling Price (x$100000)",
    x = "Distance from Train Station(km)",
    title = "Sold Price vs Distance from Train Station for 1 Bedroom and 1 Carspace"
  )

summary(combined_df_1bed_1car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  243000  334750  426500  426500  518250  610000

2 bedrooms

# Sold price (in units of $100,000) per distance band for 2-bedroom
# townhouses, split by carspace count; each plot is followed by a summary of
# the raw sold prices (the `## ...` lines are output of the original run).
# 2 bedrooms, 1 carspace
ggplot(combined_df_2bed_1car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 2 Bedrooms and 1 Carspace", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  230000  343875  419500  455068  530000 1470000
# 2 bedrooms, 2 carspaces
ggplot(combined_df_2bed_2car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 2 Bedrooms and 2 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2bed_2car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  225000  365000  460200  483116  585500 1120000

3 bedrooms

# Sold price (in units of $100,000) per distance band for 3-bedroom
# townhouses with 1 and 2 carspaces; each plot is followed by a summary of the
# raw sold prices (the `## ...` lines are output of the original run).
# 3 bedrooms, 1 carspace
ggplot(combined_df_3bed_1car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 1 Carspace", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   73000  393000  515000  541211  645375 2020000
# 3 bedrooms, 2 carspaces
ggplot(combined_df_3bed_2car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 2 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3bed_2car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  425750  595000  604433  700000 1950000
# Sold price (in units of $100,000) per distance band for townhouses with
# 3 bedrooms and 3 carspaces, followed by a summary of the raw sold prices.
ggplot(combined_df_3bed_3car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 3 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

# Summarise the subset that was just plotted. This call previously read
# combined_df_3bed_2car, so the printed output that follows still shows the
# 2-carspace figures until the report is re-knitted.
summary(combined_df_3bed_3car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  425750  595000  604433  700000 1950000
# Sold price (in units of $100,000) per distance band for townhouses with
# 3 bedrooms and 4 carspaces, followed by a summary of the raw sold prices.
ggplot(
  data = combined_df_3bed_4car,
  mapping = aes(x = distance_class, y = soldprice / 100000)
) +
  geom_boxplot(outlier.size = 1.5, outlier.colour = "blue") +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45)) +
  labs(
    y = "Selling Price (x$100000)",
    x = "Distance from Train Station(km)",
    title = "Sold Price vs Distance from Train Station for 3 Bedrooms and 4 Carspaces"
  )

summary(combined_df_3bed_4car[["soldprice"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  429500  471750  488000  547062  575000  770000

4 bedrooms

# Sold price (in units of $100,000) per distance band for 4-bedroom
# townhouses, split by carspace count (1 to 4); each plot is followed by a
# summary of the raw sold prices (`## ...` lines are output of the original run).
# 4 bedrooms, 1 carspace
ggplot(combined_df_4bed_1car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 1 Carspace", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_4bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  320000  440000  565000  613430  701500 1625000
# 4 bedrooms, 2 carspaces
ggplot(combined_df_4bed_2car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 2 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_4bed_2car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  160000  550000  710000  721376  851000 1950000
# 4 bedrooms, 3 carspaces
ggplot(combined_df_4bed_3car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 3 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_4bed_3car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  606000  781500  904300  832250 3000000
# 4 bedrooms, 4 carspaces
ggplot(combined_df_4bed_4car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 4 Bedrooms and 4 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_4bed_4car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  456000  580250  595000  629000  691250  800000

5 bedrooms

# Sold price (in units of $100,000) per distance band for 5-bedroom
# townhouses, split by carspace count (1 to 3); each plot is followed by a
# summary of the raw sold prices (`## ...` lines are output of the original run).
# 5 bedrooms, 1 carspace
ggplot(combined_df_5bed_1car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 1 Carspace", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_5bed_1car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  615000  615000  615000  615000  615000  615000
# 5 bedrooms, 2 carspaces
ggplot(combined_df_5bed_2car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 2 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_5bed_2car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  595000  842500 1165000 1231101 1571000 2090000
# 5 bedrooms, 3 carspaces
ggplot(combined_df_5bed_3car, aes(x = distance_class, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs Distance from Train Station for 5 Bedrooms and 3 Carspaces", x="Distance from Train Station(km)", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

# NOTE(review): the recorded output for this summary has no values, which
# suggests the 5-bedroom / 3-carspace subset matched no rows - confirm.
summary(combined_df_5bed_3car$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 

Creating a column for Year

# Calendar year of each sale, stored as a factor so it can serve as a
# discrete x axis.
combined_df$Year <- factor(format(as.Date(combined_df$yearsold), "%Y"))

# Filtering by distance class: one subset per 250 m band, named after the
# band's lower bound in km. The labels must match the levels produced by cut();
# "(2.5,2.75]" and "(2.75,3]" only exist when the cut() breaks include 2.75.
combined_df_0.00 <- combined_df %>% filter(distance_class == "(0,0.25]")
combined_df_0.25 <- combined_df %>% filter(distance_class == "(0.25,0.5]")
combined_df_0.50 <- combined_df %>% filter(distance_class == "(0.5,0.75]")
combined_df_0.75 <- combined_df %>% filter(distance_class == "(0.75,1]")
combined_df_1.00 <- combined_df %>% filter(distance_class == "(1,1.25]")
combined_df_1.25 <- combined_df %>% filter(distance_class == "(1.25,1.5]")
combined_df_1.50 <- combined_df %>% filter(distance_class == "(1.5,1.75]")
combined_df_1.75 <- combined_df %>% filter(distance_class == "(1.75,2]")
combined_df_2.00 <- combined_df %>% filter(distance_class == "(2,2.25]")
combined_df_2.25 <- combined_df %>% filter(distance_class == "(2.25,2.5]")
combined_df_2.50 <- combined_df %>% filter(distance_class == "(2.5,2.75]")
combined_df_2.75 <- combined_df %>% filter(distance_class == "(2.75,3]")
combined_df_3.00 <- combined_df %>% filter(distance_class == "(3,3.25]")
combined_df_3.25 <- combined_df %>% filter(distance_class == "(3.25,3.5]")
combined_df_3.50 <- combined_df %>% filter(distance_class == "(3.5,3.75]")
combined_df_3.75 <- combined_df %>% filter(distance_class == "(3.75,4]")
# Box plots of sold price (in units of $100,000) per sale year, one per 250 m
# distance band from 0 to 3.25 km. Each plot is followed by a summary of the
# raw sold prices of that band (`## ...` lines are output of the original run).
# 0 to 0.25 km
ggplot(combined_df_0.00, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 0 to 0.25km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_0.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  260000  337750  412500  489812  552500 1150000
# 0.25 to 0.50 km
ggplot(combined_df_0.25, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 0.25 to 0.50km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_0.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  165000  365000  506000  518858  635000 1545000
# 0.50 to 0.75 km
ggplot(combined_df_0.50, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 0.50 to 0.75km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_0.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  185000  395000  535500  569420  685000 2020000
# 0.75 to 1.00 km
ggplot(combined_df_0.75, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 0.75 to 1.00km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_0.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   65000  395000  546500  585952  700750 1950000
# 1.00 to 1.25 km
ggplot(combined_df_1.00, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 1.00 to 1.25km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_1.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     575  380000  481000  546800  660000 1950000
# 1.25 to 1.50 km
ggplot(combined_df_1.25, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 1.25 to 1.50km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_1.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  100000  395000  490000  541084  647498 3000000
# 1.50 to 1.75 km
ggplot(combined_df_1.50, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 1.50 to 1.75km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_1.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  310000  435000  571500  595094  652750 2090000
# 1.75 to 2.00 km
ggplot(combined_df_1.75, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 1.75 to 2.00km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_1.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  286000  406000  560000  603858  757000 1515000
# 2.00 to 2.25 km
ggplot(combined_df_2.00, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 2.00 to 2.25km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  160000  400000  491000  499950  605000  800000
# 2.25 to 2.50 km
ggplot(combined_df_2.25, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 2.25 to 2.50km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  270000  377000  465000  508040  637500 1230000
# 2.50 to 2.75 km
# NOTE(review): the recorded summary for this band has no values; the breaks
# given to cut() earlier in the report jump from 2.500 to 3.000, so a
# "(2.5,2.75]" level was presumably never created - confirm.
ggplot(combined_df_2.50, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 2.50 to 2.75km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 
# 2.75 to 3.00 km
# NOTE(review): the recorded summary is empty here as well, presumably for the
# same reason (no "(2.75,3]" level) - confirm.
ggplot(combined_df_2.75, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 2.75 to 3.00km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_2.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 
# 3.00 to 3.25 km
ggplot(combined_df_3.00, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 3.00 to 3.25km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3.00$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  305000  484500  710000  706013  773500 1777000
# Sold price (in units of $100,000) per sale year for townhouses in the
# (3.25,3.5] km band, followed by a summary of the raw sold prices.
# The title now states the band that combined_df_3.25 actually holds; it
# previously read "3.25 to 3.75km".
ggplot(combined_df_3.25, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 3.25 to 3.50km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3.25$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  555000  555000  555000  555000  555000  555000
# Box plots of sold price (in units of $100,000) per sale year for the two
# outermost distance bands; each plot is followed by a summary of the raw
# sold prices (`## ...` lines are output of the original run).
# 3.50 to 3.75 km
ggplot(combined_df_3.50, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 3.50 to 3.75km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3.50$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  250000  262000  274000  274000  286000  298000
# 3.75 to 4.00 km
ggplot(combined_df_3.75, aes(x = Year, y = soldprice/100000))+
geom_boxplot(outlier.colour = "blue", outlier.size=1.5) +
  labs(title = "Sold Price vs year for townhouses 3.75 to 4.00km from train station", x="Year", y="Selling Price (x$100000)")+
  theme_bw()+
  theme(axis.text.x = element_text(angle=45,hjust=1))

summary(combined_df_3.75$soldprice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  250000  316000  360000  398110  455250  664000